1. One dimensional Partial Dependence Plot

#install.packages("randomForest")
#install.packages("pdp")
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pdp)
library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(reshape2)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(randomForestSRC)
## 
##  randomForestSRC 3.2.3 
##  
##  Type rfsrc.news() to see new features, changes, and bug fixes. 
## 
## 
## Attaching package: 'randomForestSRC'
## The following object is masked from 'package:pdp':
## 
##     partial
days <- read.csv("Datos/day.csv")
hour <- read.csv("Datos/hour.csv")

days$dteday <- as_date(days$dteday)
days_since <- select(days, workingday, holiday, temp, hum, windspeed, cnt)
days_since$days_since_2011 <- int_length(interval(ymd("2011-01-01"), days$dteday)) / (3600*24)
days_since$SUMMER <- ifelse(days$season == 3, 1, 0)
days_since$FALL <- ifelse(days$season == 4, 1, 0)
days_since$WINTER <- ifelse(days$season == 1, 1, 0)
days_since$MISTY <- ifelse(days$weathersit == 2, 1, 0)
days_since$RAIN <- ifelse(days$weathersit == 3 | days$weathersit == 4, 1, 0)
days_since$temp <- days_since$temp * 47 - 8
days_since$hum <- days_since$hum * 100
days_since$windspeed <- days_since$windspeed * 67

rf <- rfsrc(cnt~., data=days_since)

results <- select(days_since, days_since_2011, temp, hum, windspeed, cnt)
nr <- nrow(days_since)
for(c in names(results)[1:4])
{
  for(i in 1:nr){
    r <- days_since
    r[[c]] <- days_since[[c]][i]
    sal <- predict(rf, r)$predicted
    results[[c]][i] <- sum(sal) / nr
  }
}
p1 = ggplot(days_since, aes(x = days_since_2011, y = results$days_since_2011)) +
  geom_line(color = "#9FBFB6") + 
  ylim(c(0,6000)) + 
  geom_rug(alpha = 0.7,color = '#D9A282', sides = "b") +
  ylab("Prediction") + 
  xlab("Days since 2011")

p2 = ggplot(days_since, aes(x = temp, y=results$temp)) + 
  geom_line(color = "#9FBFB6") + 
  ylim(c(0,6000)) +
  geom_rug(alpha = 0.7, color = '#D9A282',sides = "b")+ 
  xlab("Temperature") +ylab("Prediction")

p3 = ggplot(days_since, aes(x = hum, y = results$hum)) + 
  geom_line(color = "#9FBFB6") + 
  ylim(c(0,6000)) +
  geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") + 
  xlab("Humidity")+ ylab("Prediction")

p4 = ggplot(days_since, aes(x = windspeed, y = results$windspeed)) + 
  geom_line(color = "#9FBFB6") + 
  ylim(c(0,6000)) +
  geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") + 
  xlab("Wind speed") + ylab("Prediction")

subplot(p1, p2, p3, p4, shareY = TRUE, shareX = FALSE, titleX = TRUE)

2. Bidimensional Partial Dependency Plot.

sampled <- sample_n(days_since, 40)
temp <- sampled$temp
hum <- sampled$hum
th <- inner_join(data.frame(temp),data.frame(hum), by=character())
## Warning: Using `by = character()` to perform a cross join was deprecated in dplyr 1.1.0.
## ℹ Please use `cross_join()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
th$p <- 0

for(i in 1:nrow(th)){
  r <- days_since
  r[["temp"]] <- th[["temp"]][i]
  r[["hum"]] <- th[["hum"]][i]
  
  sal <- predict(rf, r)$predicted
  th[["p"]][i] <- sum(sal) / nr
}
p = ggplot(th, aes(x = temp, y = hum)) + 
  geom_tile(aes(fill = p), width = 10, height = 15) +
  geom_rug(alpha = 0.01) +
  xlab("Temperature") +
  ylab("Humidity") +
  scale_fill_gradientn(name = "Number of bikes",
                       colors = c("#9FBFB6", "white", "#D9A282"),
                       values = scales::rescale(c(10, 50, 90))) +
  theme_minimal()
p

3. PDP to explain the price of a house.

set.seed(100)

d <- read.csv("Datos/kc_house_data.csv")

sampled <- sample_n(d, 1000)

sampled <- select(sampled, bedrooms, bathrooms, sqft_living, sqft_lot, floors, yr_built, price)

rf <- rfsrc(price~., data=sampled)

results <- select(sampled, bedrooms, bathrooms, sqft_living, floors, price)
nr <- nrow(sampled)
for(c in names(results)[1:4])
{
  for(i in 1:nr){
    r <- sampled
    r[[c]] <- sampled[[c]][i]
    sal <- predict(rf, r)$predicted
    results[[c]][i] <- sum(sal) / nr
  }
}
plot1 = ggplot(sampled, aes(x = bedrooms, y = results$bedrooms)) + 
  geom_line(color = '#9FBFB6') + 
  geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") + 
  ylab("Prediction") + xlab("Bedrooms")

plot2 = ggplot(sampled, aes(x = bathrooms, y = results$bathrooms)) + 
  geom_line(color = '#9FBFB6') + 
  geom_rug(alpha = 0.7, color = '#D9A282',sides = "b") + 
  xlab("Bathrooms")

plot3 = ggplot(sampled, aes(x = sqft_living, y = results$sqft_living)) + 
  geom_line(color = '#9FBFB6') + 
  geom_rug(alpha = 0.7,color = '#D9A282', sides = "b") + 
  xlab("Sqft Living")

plot4 = ggplot(sampled, aes(x = floors, y = results$floors)) + 
  geom_line(color = '#9FBFB6') + 
  geom_rug(alpha = 0.7, color = '#D9A282',sides = "b")+ 
  xlab("Floors")

subplot(plot1, plot2, plot3, plot4, shareX = FALSE, titleX = TRUE)